import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from interpret import show
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
# Render interpret visualizations inline (e.g. in a notebook cell)
# instead of spawning a separate dashboard server.
set_visualize_provider(InlineProvider())
# import data
# NOTE(review): quotechar="'" implies fields are single-quoted — confirm against the CSV.
df = pd.read_csv("./data/CEE_DATA.csv", quotechar="'")
# Predictor columns: the categorical survey attributes used as features.
feature_columns = [
    "Gender",
    "Caste",
    "coaching",
    "time",
    "Class_ten_education",
    "twelve_education",
    "medium",
    "Class_X_Percentage",
    "Class_XII_Percentage",
    "Father_occupation",
    "Mother_occupation",
]
X = df[feature_columns]
# Target: the student's performance category.
Y = df["Performance"]
# split data
seed = 1  # fixed RNG seed so the train/test partition is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=seed
)
print("Train Size Instances: ", X_train.shape[0])
print("Test Size Instances:", X_test.shape[0])
# Output from a previous run (was pasted in as a bare line, which is a SyntaxError):
# Train Size Instances: 466 Test Size Instances: 200
# One-hot encode the categorical features for the models that need numeric input.
# NOTE(review): sparse=False / get_feature_names match older scikit-learn (<1.2);
# newer releases renamed these to sparse_output= and get_feature_names_out().
ohe = OneHotEncoder(sparse=False)
# Reuse X's own columns instead of re-typing the feature list by hand,
# so the encoded names can never drift out of sync with the selection above.
Xoded = pd.DataFrame(
    ohe.fit_transform(X),
    columns=ohe.get_feature_names(X.columns),
)
# Same seed as the raw split, so encoded rows line up with the raw partition.
X_train_enc, X_test_enc, Y_train_enc, Y_test_enc = train_test_split(
    Xoded, Y, test_size=0.3, random_state=seed
)
print("Train Size Instances: ", X_train_enc.shape[0])
print("Test Size Instances:", X_test_enc.shape[0])
# Output from a previous run (was pasted in as a bare line, which is a SyntaxError):
# Train Size Instances: 466 Test Size Instances: 200
# https://interpret.ml/docs/ebm.html?highlight=multiclass#
# EBMs handle raw categorical columns directly, so fit on the un-encoded split.
ebm = ExplainableBoostingClassifier(random_state=seed)
ebm.fit(X_train, Y_train)
# global explanations
ebm_global = ebm.explain_global()
show(ebm_global)
# Warnings emitted on a previous run (pasted output; the bare lines broke the script):
# UserWarning: Multiclass is still experimental. Subject to change per release.
# UserWarning: Detected multiclass problem: forcing interactions to 0
# local explanations
# BUG FIX: the labels must come from the same split as the samples —
# the original paired X_test rows with Y_train labels.
ebm_local = ebm.explain_local(X_test[:5], Y_test[:5])
show(ebm_local)
# https://interpret.ml/docs/lr.html
from interpret.glassbox import LogisticRegression

# max_iter raised: a previous run's lbfgs solver hit the default iteration
# limit before converging (see the ConvergenceWarning preserved below).
lr = LogisticRegression(random_state=seed, max_iter=1000)
lr.fit(X_train_enc, Y_train_enc)
lr_global = lr.explain_global()
show(lr_global)
# ConvergenceWarning from a previous run (pasted output; the bare lines broke the script):
# lbfgs failed to converge (status=1):
# STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
# Increase the number of iterations (max_iter) or scale the data as shown in:
# https://scikit-learn.org/stable/modules/preprocessing.html
# seems this one doesn't support strings for y labels
# lr_local = lr.explain_local(X_test_enc[:5], Y_test_enc[:5])
# show(lr_local)
from interpret.glassbox import ClassificationTree

# Glass-box decision tree, fit on the one-hot-encoded split.
tree_model = ClassificationTree(random_state=seed)
tree_model.fit(X_train_enc, Y_train_enc)
tree_global = tree_model.explain_global()
show(tree_global)
# Local explanations skipped: string y labels appear unsupported here.
# tree_local = tree_model.explain_local(X_test_enc[:5], Y_test_enc[:5])
# show(tree_local)
from interpret.glassbox import DecisionListClassifier

# Decision-list (rule-based) glass-box model on the one-hot-encoded split.
dl = DecisionListClassifier(random_state=seed)
dl.fit(X_train_enc, Y_train_enc)
dl_global = dl.explain_global()
show(dl_global)
# UserWarning from a previous run (pasted output; the bare lines broke the script):
# Found labels {0, 1, 2, 3}. This method assumes target class to be labeled
# as 1 and normal data to be labeled as 0. Any label different from 0 will
# be considered as being from the target class.
# seems this one doesn't support strings for y labels
# dl_local = dl.explain_local(X_test_enc[:5], Y_test_enc[:5])
# show(dl_local)